*******************************************************************
* This file generates the baseline dataset for demand estimation using post-office information, since these will be used as counterfactual entry locations. *
*******************************************************************
* 1. Merge the data regarding distances between post offices (i.e. new entry locations) and tracts with the original information on census tracts.
* 2. The data is filtered to contain market-notary combinations which comply with the market catchment area definition. 

*************************
* 0 Set path to files *
*************************
* Start from a clean session: clear memory, close any open log, and turn off
* output paging and command tracing.
clear all
capture log close
set more off
set trace off
* Move to the project root stored in the global macro main_directory.
* The path is wrapped in compound double quotes so that a directory name
* containing spaces is passed to -cd- as a single argument.
* If the global is empty, the working directory is left unchanged.
if `"$main_directory"' != "" {
	cd `"$main_directory"'
}
else {
	display as text "Note: global main_directory is not set; staying in the current working directory."
}


****************************************************************************************************************************************************
* 1. Merge the data regarding distances between post offices (i.e. new entry locations) and tracts with the original information on census tracts. *
****************************************************************************************************************************************************

* Load the distances between post offices (candidate entry locations) and census-tract centroids.
* Paths use forward slashes: Stata accepts them on every platform, whereas a backslash
* is Windows-only and can act as an escape character in front of a macro reference.
use "Generated_data/distance_post_office_Census_centroid_reduced.dta", clear
gen bvdid_num=firm_ID

* Market data
* market_ID is a value-labelled variable; decode it so that the municipality code (nis)
* can be cut out of the label text: up to 11 characters starting at position 7.
rename market_ID CENSUS_ID
decode CENSUS_ID, gen(CENSUS)
gen nis=substr(CENSUS,7,11)
destring nis, replace

* Attach municipality-level characteristics to every office-market row.
* NOTE(review): as in the default -merge-, unmatched rows from either side are kept;
* consider keep(master match) if municipalities without any office-market row should be excluded here.
merge m:1 nis using "Generated_data/BE_municipal_data.dta", nogenerate
drop Income //This is the income based on regional rather than municipal data.

rename CENSUS_ID market_ID

format market_ID %24.0g //to more easily see this with browse

* Notary-level variables for the counterfactual post-office locations.
* There are no notaries in the post offices, so each variable is filled with
* one constant placeholder value for all rows.
gen start_not=19.49315 //This implies that this will be a new office.
gen NotPerOff=0 //No notary data exists for post offices; the number of notaries is set to 0.
gen lnNotPerOff=. //Log of the number of notaries: left missing.
gen LDE=0 //Legal Form is "LDE"; 0 is the mode.
* Output levels are 0, since there are no notaries in the location.
foreach output in Q Q1 Q2 {
	gen `output'=0
}

*************************************************************************************************************************
* 2. The data is filtered to contain market-notary combinations which comply with the market catchment area definition. *
*************************************************************************************************************************
* Rank the offices by distance within each market (1 = closest).
* Ties in distance are broken by firm_ID, so the ranking -- and therefore the choice set
* selected below -- is the same on every run. Missing values sort last, so rows with a
* missing distance receive a missing rank and do not shift the ranks of the other rows.
bysort market_ID (distance firm_ID): gen rankdist_market = _n if !missing(distance)

* Flag rows where the market lies in the municipality in which the office is located
* (nis is the local market, firm_NIS is the office location). Used to check the geocoding:
* the market in which an office is located should always be in its catchment area.
* Missing codes are excluded explicitly because Stata evaluates (. == .) as true.
gen common=(nis==firm_NIS) & !missing(nis, firm_NIS)

tab arrondjud,gen(arr)

* Calculate market characteristics for the local market of the office:
* for each office, average the characteristic over the rows of its own municipality.
* A tempvar is used as scratch variable so that it cannot clash with a dataset variable.
foreach var of varlist popdens arrondjud GermanComm GermanExtra {
	tempvar own_market
	gen `own_market' = `var' if common==1
	egen `var'_not=mean(`own_market'),by(firm_ID)
	drop `own_market'
}

* Use arrondjud_not to create a dummy for whether the market is in the same arrondissement as the office.
* Rows where either code is missing are not treated as "same".
gen arrondjud_same=(arrondjud==arrondjud_not) & !missing(arrondjud, arrondjud_not)

* Use GermanComm_not and GermanExtra_not to create a dummy for whether the market may go
* outside its arrondissement to visit an office.
	* GermanComm=1 is arrond Eupen, GermanExtra is the part of arrond Liege near Eupen.
	* Markets in Eupen can go to that part of Liege, and markets in that part of Liege can go to Eupen.
gen arrondjud_out=(GermanComm==1&GermanExtra_not==1)|(GermanExtra==1&GermanComm_not==1)

* Selecting the choice set
* Keep office-market rows with a distance of at most 15 (km), as well as the 5 nearest offices of each market.
* Missing values compare as larger than any number, so rows with a missing distance are dropped here.
drop if distance>15  & rankdist_market>5
* Since the choice set in Brussels is potentially very large, drop offices that are not among the 15 nearest
* (offices located in the market's own municipality are always kept).
drop if arrond_str=="Arrondissement Brussel-Hoofdstad"&rankdist_market>15&common==0
* Drop offices that are not in the same judicial district, unless the Eupen/Liege exception applies.
keep if arrondjud_same==1|arrondjud_out==1

* Given the selected choice set: number of markets from which an office can collect revenue (catchment area).
egen number_markets = count(market_ID), by (firm_ID)
* Given the selected choice set: number of offices in the choice set of a market.
egen number_firms = count(firm_ID), by (market_ID)

save "Generated_data/baseline_dataset_counter_post_offices.dta", replace
